In this document we present the joint analysis of the PASS1A metabolomics datasets.
Load the data from the cloud, including: phenotypic data, metabolomic datasets, and metabolomics dictionary.
source("~/Desktop/repos/motrpac-bic-norm-qc/tools/supervised_normalization_functions.R")
source("~/Desktop/repos/motrpac-bic-norm-qc/tools/unsupervised_normalization_functions.R")
source("~/Desktop/repos/motrpac-bic-norm-qc/tools/gcp_functions.R")
source("~/Desktop/repos/motrpac-bic-norm-qc/tools/association_analysis_methods.R")
source("~/Desktop/repos/motrpac-bic-norm-qc/tools/data_aux_functions.R")
source("~/Desktop/repos/motrpac/tools/prediction_ml_tools.R")
library(randomForest) # for classification tests
# Load the dmaqc data
merged_dmaqc_data = load_from_bucket("merged_dmaqc_data2019-10-15.RData",
"gs://bic_data_analysis/pass1a/pheno_dmaqc/",F)
merged_dmaqc_data = merged_dmaqc_data[[1]]
rownames(merged_dmaqc_data) = as.character(merged_dmaqc_data$vial_label)
# define the tissue variable
merged_dmaqc_data$tissue = merged_dmaqc_data$sampletypedescription
# define the time to freeze variable
merged_dmaqc_data$time_to_freeze = merged_dmaqc_data$calculated.variables.time_death_to_collect_min +
merged_dmaqc_data$calculated.variables.time_collect_to_freeze_min
# col time vs. control
# df = data.frame(
# bid = merged_dmaqc_data$bid,
# edta_col_time = merged_dmaqc_data$calculated.variables.edta_coll_time,
# time_to_freeze = merged_dmaqc_data$time_to_freeze,
# is_control = merged_dmaqc_data$animal.key.is_control,
# tp = merged_dmaqc_data$animal.key.timepoint,
# tissue = merged_dmaqc_data$specimen.processing.sampletypedescription
# )
# df = unique(df)
# boxplot(edta_col_time/3600 ~ is_control,df)
# boxplot(edta_col_time/3600 - tp ~ is_control,df)
# wilcox.test(edta_col_time/3600 ~ is_control,df)
# blood freeze times
blood_samples =
merged_dmaqc_data$specimen.processing.sampletypedescription ==
"EDTA Plasma"
blood_freeze_time =
as.difftime(merged_dmaqc_data$specimen.processing.t_freeze,units = "mins") -
as.difftime(merged_dmaqc_data$specimen.processing.t_edtaspin,units="mins")
blood_freeze_time = as.numeric(blood_freeze_time)
time_to_freeze = merged_dmaqc_data$time_to_freeze[blood_samples] =
blood_freeze_time[blood_samples]
# Load our parsed metabolomics datasets
metabolomics_parsed_datasets = load_from_bucket(
file = "metabolomics_parsed_datasets_pass1a_external1.RData",
bucket = "gs://bic_data_analysis/pass1a/metabolomics/")[[1]]
# # Read the dictionary
# dict_bucket =
# "gs://motrpac-external-release1-results/metabolomics_targeted/motrpac_metabolomics_data_dictionary-v1.1.5.txt"
# dict_download = get_single_file_from_bucket_to_local_dir(dict_bucket)
# metabolomics_dict = fread(dict_download[[1]],data.table = F)
# Plot cv vs means
library(gplots)
d = metabolomics_parsed_datasets[["white_adipose_powder,metab_u_hilicpos,unnamed"]]
dx = d$sample_data
CoV<-function(x){return(sd(x,na.rm = T)/mean(x,na.rm=T))}
dmeans = apply(dx,1,mean,na.rm=T)
CoVs = apply(dx,1,CoV)
inds = !is.na(CoVs)
df = data.frame(Mean_intensity = dmeans[inds],CoV = CoVs[inds])
plot(CoV~Mean_intensity,df,cex=0.5,pch=20)
lines(lowess(CoV~Mean_intensity,df),lty=2,lwd=2,col="blue")
dx = log(1+d$sample_data,base=2)
dmeans = apply(dx,1,mean,na.rm=T)
CoVs = apply(dx,1,CoV)
inds = !is.na(CoVs)
df = data.frame(Mean_intensity = dmeans[inds],CoV = CoVs[inds])
plot(CoV~Mean_intensity,df,cex=0.5,pch=20)
lines(lowess(CoV~Mean_intensity,df),lty=2,lwd=2,col="blue")
# Plot number of NAs vs intensity mean
dx = log(1+d$sample_data,base=2)
dmeans = apply(dx,1,mean,na.rm=T)
num_nas = rowSums(is.na(dx))
df = data.frame(Num_NAs = num_nas[inds],Mean_intensity = dmeans[inds])
plot(Num_NAs~Mean_intensity,df,cex=0.5,pch=20)
lines(lowess(Num_NAs~Mean_intensity,df),lty=2,lwd=2,col="blue")
cor(df$Num_NAs,df$Mean_intensity,method="spearman")
Define the variables to be adjusted for:
biospec_cols = c(
"acute.test.distance",
"calculated.variables.time_to_freeze",
# "calculated.variables.edta_coll_time", # no need - see code above for blood
"bid" # required for matching datasets
)
differential_analysis_cols = c(
"animal.registration.sex",
"animal.key.timepoint",
"animal.key.is_control"
)
pipeline_qc_cols = c("sample_order")
We go over each dataset and merge the named and unnamed subparts of the untargeted datasets. For targeted data - the release buckets we merge datasets that are from the same site and tissue.
Then the preprocessing of the new datasets is as follows: (1) log the data (values are raw intensities), (2) remove rows with \(>20\%\) missing values, and (3) impute missing values.
Important: datasets that do not have unique metabolite names or sample ids probably had errors while parsing and are therefore ignored. Also, untargeted with failed mergning of the unnamed and named subsets (e.g., due to different sample ids) are ignored as well.
Finally we add an additional version for each dataset by directly regressing out the sample order component. See https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4757603/ for more details (trend correction is also called background correction, and this is done for each batch separately).
save_to_bucket(metabolomics_processed_datasets,
file="metabolomics_processed_datasets10282019.RData",
bucket = "gs://bic_data_analysis/pass1a/metabolomics/")
Copying file://metabolomics_processed_datasets10282019.RData [Content-Type=application/octet-stream]...
/ [0 files][ 0.0 B/236.4 MiB]
==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.
-
- [0 files][792.0 KiB/236.4 MiB]
\
\ [0 files][ 1.6 MiB/236.4 MiB]
|
/
/ [0 files][ 2.3 MiB/236.4 MiB]
-
- [0 files][ 2.8 MiB/236.4 MiB]
\
|
| [0 files][ 3.6 MiB/236.4 MiB]
/
/ [0 files][ 4.4 MiB/236.4 MiB]
-
\
\ [0 files][ 5.2 MiB/236.4 MiB]
|
| [0 files][ 5.9 MiB/236.4 MiB]
/
-
- [0 files][ 6.7 MiB/236.4 MiB]
\
\ [0 files][ 7.5 MiB/236.4 MiB] 584.3 KiB/s
|
/
/ [0 files][ 8.3 MiB/236.4 MiB] 595.5 KiB/s
-
- [0 files][ 9.0 MiB/236.4 MiB] 615.9 KiB/s
\
|
| [0 files][ 9.8 MiB/236.4 MiB] 643.9 KiB/s
/
/ [0 files][ 10.6 MiB/236.4 MiB] 724.6 KiB/s
-
\
\ [0 files][ 11.3 MiB/236.4 MiB] 705.5 KiB/s
|
| [0 files][ 11.9 MiB/236.4 MiB] 659.8 KiB/s
/
-
- [0 files][ 12.4 MiB/236.4 MiB] 568.8 KiB/s
\
\ [0 files][ 13.2 MiB/236.4 MiB] 561.7 KiB/s
|
/
/ [0 files][ 13.9 MiB/236.4 MiB] 564.0 KiB/s
-
- [0 files][ 14.7 MiB/236.4 MiB] 601.3 KiB/s
\
|
| [0 files][ 15.5 MiB/236.4 MiB] 690.7 KiB/s
/
/ [0 files][ 16.2 MiB/236.4 MiB] 697.3 KiB/s
-
\
\ [0 files][ 17.0 MiB/236.4 MiB] 697.6 KiB/s
|
| [0 files][ 17.8 MiB/236.4 MiB] 697.0 KiB/s
/
-
- [0 files][ 18.6 MiB/236.4 MiB] 698.4 KiB/s
\
\ [0 files][ 19.3 MiB/236.4 MiB] 701.8 KiB/s
|
/
/ [0 files][ 20.1 MiB/236.4 MiB] 699.6 KiB/s
-
- [0 files][ 20.9 MiB/236.4 MiB] 701.1 KiB/s
\
|
| [0 files][ 21.7 MiB/236.4 MiB] 694.5 KiB/s
/
/ [0 files][ 22.4 MiB/236.4 MiB] 695.0 KiB/s
-
\
\ [0 files][ 23.2 MiB/236.4 MiB] 695.0 KiB/s
|
| [0 files][ 24.0 MiB/236.4 MiB] 710.8 KiB/s
/
-
- [0 files][ 24.8 MiB/236.4 MiB] 718.3 KiB/s
\
\ [0 files][ 25.5 MiB/236.4 MiB] 717.3 KiB/s
|
/
/ [0 files][ 26.3 MiB/236.4 MiB] 709.0 KiB/s
-
- [0 files][ 27.1 MiB/236.4 MiB] 707.2 KiB/s
\
|
| [0 files][ 27.8 MiB/236.4 MiB] 707.8 KiB/s
/
/ [0 files][ 28.6 MiB/236.4 MiB] 699.6 KiB/s
-
\
\ [0 files][ 29.4 MiB/236.4 MiB] 714.4 KiB/s
|
| [0 files][ 30.2 MiB/236.4 MiB] 716.4 KiB/s
/
-
- [0 files][ 30.9 MiB/236.4 MiB] 720.3 KiB/s
\
\ [0 files][ 31.7 MiB/236.4 MiB] 723.4 KiB/s
|
/
/ [0 files][ 32.5 MiB/236.4 MiB] 707.3 KiB/s
-
- [0 files][ 33.3 MiB/236.4 MiB] 706.9 KiB/s
\
|
| [0 files][ 34.0 MiB/236.4 MiB] 713.6 KiB/s
/
/ [0 files][ 34.8 MiB/236.4 MiB] 704.6 KiB/s
-
\
\ [0 files][ 35.6 MiB/236.4 MiB] 695.8 KiB/s
|
| [0 files][ 36.4 MiB/236.4 MiB] 697.3 KiB/s
/
-
- [0 files][ 37.1 MiB/236.4 MiB] 690.9 KiB/s
\
\ [0 files][ 37.9 MiB/236.4 MiB] 689.2 KiB/s
|
/
/ [0 files][ 38.7 MiB/236.4 MiB] 688.3 KiB/s
-
- [0 files][ 39.5 MiB/236.4 MiB] 683.8 KiB/s
\
|
| [0 files][ 40.2 MiB/236.4 MiB] 613.0 KiB/s
/
/ [0 files][ 40.7 MiB/236.4 MiB] 499.7 KiB/s
-
\
\ [0 files][ 41.5 MiB/236.4 MiB] 441.9 KiB/s
|
/
/ [0 files][ 42.0 MiB/236.4 MiB] 416.6 KiB/s
-
\
\ [0 files][ 42.5 MiB/236.4 MiB] 440.8 KiB/s
|
| [0 files][ 43.1 MiB/236.4 MiB] 398.9 KiB/s
/
/ [0 files][ 43.6 MiB/236.4 MiB] 415.6 KiB/s
-
- [0 files][ 44.1 MiB/236.4 MiB] 428.6 KiB/s
\
\ [0 files][ 44.6 MiB/236.4 MiB] 443.4 KiB/s
|
/
/ [0 files][ 45.4 MiB/236.4 MiB] 498.4 KiB/s
-
- [0 files][ 46.2 MiB/236.4 MiB] 568.0 KiB/s
\
|
| [0 files][ 46.9 MiB/236.4 MiB] 623.1 KiB/s
/
/ [0 files][ 47.7 MiB/236.4 MiB] 670.7 KiB/s
-
\
\ [0 files][ 48.5 MiB/236.4 MiB] 700.1 KiB/s
|
| [0 files][ 49.2 MiB/236.4 MiB] 696.4 KiB/s
/
-
- [0 files][ 50.0 MiB/236.4 MiB] 685.0 KiB/s
\
\ [0 files][ 50.8 MiB/236.4 MiB] 687.2 KiB/s
|
/
/ [0 files][ 51.6 MiB/236.4 MiB] 676.6 KiB/s
-
- [0 files][ 52.3 MiB/236.4 MiB] 681.6 KiB/s
\
|
| [0 files][ 52.9 MiB/236.4 MiB] 637.2 KiB/s
/
/ [0 files][ 53.4 MiB/236.4 MiB] 582.5 KiB/s
-
- [0 files][ 54.1 MiB/236.4 MiB] 546.8 KiB/s
\
|
| [0 files][ 54.9 MiB/236.4 MiB] 545.0 KiB/s
/
/ [0 files][ 55.7 MiB/236.4 MiB] 609.1 KiB/s
-
\
\ [0 files][ 56.5 MiB/236.4 MiB] 677.9 KiB/s
|
| [0 files][ 57.2 MiB/236.4 MiB] 704.3 KiB/s
/
-
- [0 files][ 58.0 MiB/236.4 MiB] 715.2 KiB/s
\
\ [0 files][ 58.8 MiB/236.4 MiB] 719.1 KiB/s
|
/
/ [0 files][ 59.6 MiB/236.4 MiB] 715.7 KiB/s
-
- [0 files][ 60.3 MiB/236.4 MiB] 694.8 KiB/s
\
|
| [0 files][ 61.1 MiB/236.4 MiB] 704.6 KiB/s
/
/ [0 files][ 61.9 MiB/236.4 MiB] 691.8 KiB/s
-
\
\ [0 files][ 62.7 MiB/236.4 MiB] 692.8 KiB/s
|
| [0 files][ 63.4 MiB/236.4 MiB] 693.7 KiB/s
/
-
- [0 files][ 64.2 MiB/236.4 MiB] 711.6 KiB/s
\
\ [0 files][ 65.0 MiB/236.4 MiB] 707.4 KiB/s
|
/
/ [0 files][ 65.7 MiB/236.4 MiB] 709.9 KiB/s
-
- [0 files][ 66.5 MiB/236.4 MiB] 703.2 KiB/s
\
|
| [0 files][ 67.3 MiB/236.4 MiB] 703.4 KiB/s
/
/ [0 files][ 68.1 MiB/236.4 MiB] 706.5 KiB/s
-
\
\ [0 files][ 68.8 MiB/236.4 MiB] 697.6 KiB/s
|
| [0 files][ 69.6 MiB/236.4 MiB] 701.8 KiB/s
/
-
- [0 files][ 70.4 MiB/236.4 MiB] 708.6 KiB/s
\
\ [0 files][ 71.2 MiB/236.4 MiB] 704.0 KiB/s
|
/
/ [0 files][ 71.9 MiB/236.4 MiB] 695.6 KiB/s
-
- [0 files][ 72.7 MiB/236.4 MiB] 692.6 KiB/s
\
|
| [0 files][ 73.5 MiB/236.4 MiB] 692.2 KiB/s
/
/ [0 files][ 74.3 MiB/236.4 MiB] 695.8 KiB/s
-
\
\ [0 files][ 75.0 MiB/236.4 MiB] 707.5 KiB/s
|
| [0 files][ 75.8 MiB/236.4 MiB] 701.9 KiB/s
/
-
- [0 files][ 76.6 MiB/236.4 MiB] 707.7 KiB/s
\
\ [0 files][ 77.3 MiB/236.4 MiB] 716.4 KiB/s
|
/
/ [0 files][ 78.1 MiB/236.4 MiB] 717.2 KiB/s
-
- [0 files][ 78.9 MiB/236.4 MiB] 715.5 KiB/s
\
|
| [0 files][ 79.7 MiB/236.4 MiB] 710.7 KiB/s
/
/ [0 files][ 80.4 MiB/236.4 MiB] 712.6 KiB/s
-
\
\ [0 files][ 81.2 MiB/236.4 MiB] 713.9 KiB/s
|
| [0 files][ 82.0 MiB/236.4 MiB] 717.2 KiB/s
/
-
- [0 files][ 82.8 MiB/236.4 MiB] 718.5 KiB/s
\
\ [0 files][ 83.5 MiB/236.4 MiB] 716.3 KiB/s
|
/
/ [0 files][ 84.3 MiB/236.4 MiB] 722.0 KiB/s
-
- [0 files][ 85.1 MiB/236.4 MiB] 706.5 KiB/s
\
|
| [0 files][ 85.9 MiB/236.4 MiB] 709.6 KiB/s
/
/ [0 files][ 86.6 MiB/236.4 MiB] 707.8 KiB/s
-
\
\ [0 files][ 87.4 MiB/236.4 MiB] 709.1 KiB/s
|
| [0 files][ 88.2 MiB/236.4 MiB] 701.4 KiB/s
/
-
- [0 files][ 89.0 MiB/236.4 MiB] 714.8 KiB/s
\
\ [0 files][ 89.7 MiB/236.4 MiB] 702.0 KiB/s
|
/
/ [0 files][ 90.5 MiB/236.4 MiB] 702.1 KiB/s
-
- [0 files][ 91.3 MiB/236.4 MiB] 696.5 KiB/s
\
|
| [0 files][ 92.0 MiB/236.4 MiB] 664.0 KiB/s
/
/ [0 files][ 92.8 MiB/236.4 MiB] 668.1 KiB/s
-
\
\ [0 files][ 93.6 MiB/236.4 MiB] 651.1 KiB/s
|
| [0 files][ 94.4 MiB/236.4 MiB] 652.6 KiB/s
/
-
- [0 files][ 95.1 MiB/236.4 MiB] 675.7 KiB/s
\
\ [0 files][ 95.9 MiB/236.4 MiB] 682.9 KiB/s
|
/
/ [0 files][ 96.7 MiB/236.4 MiB] 704.0 KiB/s
-
- [0 files][ 97.5 MiB/236.4 MiB] 713.1 KiB/s
\
|
| [0 files][ 98.2 MiB/236.4 MiB] 715.0 KiB/s
/
/ [0 files][ 99.0 MiB/236.4 MiB] 704.0 KiB/s
-
\
\ [0 files][ 99.8 MiB/236.4 MiB] 711.6 KiB/s
|
| [0 files][100.6 MiB/236.4 MiB] 700.3 KiB/s
/
-
- [0 files][101.3 MiB/236.4 MiB] 652.2 KiB/s
\
|
| [0 files][102.1 MiB/236.4 MiB] 658.6 KiB/s
/
/ [0 files][102.9 MiB/236.4 MiB] 648.2 KiB/s
-
\
\ [0 files][103.6 MiB/236.4 MiB] 655.1 KiB/s
|
| [0 files][104.4 MiB/236.4 MiB] 692.7 KiB/s
/
-
- [0 files][105.2 MiB/236.4 MiB] 683.7 KiB/s
\
\ [0 files][106.0 MiB/236.4 MiB] 695.2 KiB/s
|
/
/ [0 files][106.7 MiB/236.4 MiB] 681.1 KiB/s
-
- [0 files][107.5 MiB/236.4 MiB] 677.4 KiB/s
\
|
| [0 files][108.0 MiB/236.4 MiB] 636.9 KiB/s
/
/ [0 files][108.8 MiB/236.4 MiB] 626.4 KiB/s
-
\
\ [0 files][109.6 MiB/236.4 MiB] 644.2 KiB/s
|
| [0 files][110.3 MiB/236.4 MiB] 648.8 KiB/s
/
-
- [0 files][111.1 MiB/236.4 MiB] 705.2 KiB/s
\
\ [0 files][111.9 MiB/236.4 MiB] 692.5 KiB/s
|
/
/ [0 files][112.7 MiB/236.4 MiB] 698.7 KiB/s
-
- [0 files][113.4 MiB/236.4 MiB] 688.4 KiB/s
\
|
| [0 files][114.2 MiB/236.4 MiB] 689.0 KiB/s
/
/ [0 files][115.0 MiB/236.4 MiB] 699.3 KiB/s
-
\
\ [0 files][115.8 MiB/236.4 MiB] 692.1 KiB/s
|
| [0 files][116.5 MiB/236.4 MiB] 701.9 KiB/s
/
-
- [0 files][117.3 MiB/236.4 MiB] 702.3 KiB/s
\
\ [0 files][118.1 MiB/236.4 MiB] 698.0 KiB/s
|
/
/ [0 files][118.9 MiB/236.4 MiB] 707.2 KiB/s
-
- [0 files][119.6 MiB/236.4 MiB] 706.4 KiB/s
\
|
| [0 files][120.4 MiB/236.4 MiB] 717.3 KiB/s
/
/ [0 files][121.2 MiB/236.4 MiB] 710.4 KiB/s
-
\
\ [0 files][122.0 MiB/236.4 MiB] 701.5 KiB/s
|
| [0 files][122.7 MiB/236.4 MiB] 708.7 KiB/s
/
-
- [0 files][123.5 MiB/236.4 MiB] 708.1 KiB/s
\
\ [0 files][124.3 MiB/236.4 MiB] 709.2 KiB/s
|
/
/ [0 files][125.0 MiB/236.4 MiB] 715.2 KiB/s
-
- [0 files][125.8 MiB/236.4 MiB] 716.7 KiB/s
\
|
| [0 files][126.6 MiB/236.4 MiB] 720.7 KiB/s
/
/ [0 files][127.4 MiB/236.4 MiB] 705.5 KiB/s
-
\
\ [0 files][128.1 MiB/236.4 MiB] 703.0 KiB/s
|
| [0 files][128.9 MiB/236.4 MiB] 705.7 KiB/s
/
-
- [0 files][129.7 MiB/236.4 MiB] 689.2 KiB/s
\
\ [0 files][130.5 MiB/236.4 MiB] 707.1 KiB/s
|
/
/ [0 files][131.2 MiB/236.4 MiB] 700.2 KiB/s
-
- [0 files][132.0 MiB/236.4 MiB] 698.2 KiB/s
\
|
| [0 files][132.8 MiB/236.4 MiB] 700.4 KiB/s
/
/ [0 files][133.6 MiB/236.4 MiB] 711.4 KiB/s
-
\
\ [0 files][134.3 MiB/236.4 MiB] 708.8 KiB/s
|
| [0 files][135.1 MiB/236.4 MiB] 706.8 KiB/s
/
-
- [0 files][135.9 MiB/236.4 MiB] 705.8 KiB/s
\
\ [0 files][136.6 MiB/236.4 MiB] 708.4 KiB/s
|
/
/ [0 files][137.4 MiB/236.4 MiB] 709.5 KiB/s
-
- [0 files][138.2 MiB/236.4 MiB] 716.4 KiB/s
\
|
| [0 files][139.0 MiB/236.4 MiB] 718.1 KiB/s
/
/ [0 files][139.7 MiB/236.4 MiB] 710.1 KiB/s
-
\
\ [0 files][140.5 MiB/236.4 MiB] 707.6 KiB/s
|
| [0 files][141.3 MiB/236.4 MiB] 716.8 KiB/s
/
-
- [0 files][142.1 MiB/236.4 MiB] 684.2 KiB/s
\
\ [0 files][142.8 MiB/236.4 MiB] 687.6 KiB/s
|
/
/ [0 files][143.6 MiB/236.4 MiB] 691.2 KiB/s
-
- [0 files][144.4 MiB/236.4 MiB] 689.4 KiB/s
\
|
| [0 files][145.2 MiB/236.4 MiB] 692.5 KiB/s
/
/ [0 files][145.9 MiB/236.4 MiB] 699.6 KiB/s
-
\
\ [0 files][146.7 MiB/236.4 MiB] 705.8 KiB/s
|
| [0 files][147.5 MiB/236.4 MiB] 706.6 KiB/s
/
/ [0 files][148.0 MiB/236.4 MiB] 617.0 KiB/s
-
\
\ [0 files][148.5 MiB/236.4 MiB] 484.8 KiB/s
|
/
/ [0 files][149.3 MiB/236.4 MiB] 482.3 KiB/s
-
- [0 files][150.1 MiB/236.4 MiB] 492.7 KiB/s
\
|
| [0 files][150.8 MiB/236.4 MiB] 582.4 KiB/s
/
/ [0 files][151.6 MiB/236.4 MiB] 712.8 KiB/s
-
\
\ [0 files][152.4 MiB/236.4 MiB] 710.2 KiB/s
|
| [0 files][153.1 MiB/236.4 MiB] 710.7 KiB/s
/
-
- [0 files][153.9 MiB/236.4 MiB] 707.4 KiB/s
\
\ [0 files][154.7 MiB/236.4 MiB] 711.6 KiB/s
|
/
/ [0 files][155.5 MiB/236.4 MiB] 711.8 KiB/s
-
- [0 files][156.2 MiB/236.4 MiB] 690.1 KiB/s
\
|
| [0 files][157.0 MiB/236.4 MiB] 697.7 KiB/s
/
/ [0 files][157.8 MiB/236.4 MiB] 692.5 KiB/s
-
\
\ [0 files][158.6 MiB/236.4 MiB] 687.6 KiB/s
|
| [0 files][159.3 MiB/236.4 MiB] 703.6 KiB/s
/
-
- [0 files][160.1 MiB/236.4 MiB] 706.4 KiB/s
\
\ [0 files][160.9 MiB/236.4 MiB] 717.4 KiB/s
|
/
/ [0 files][161.7 MiB/236.4 MiB] 715.4 KiB/s
-
- [0 files][162.4 MiB/236.4 MiB] 708.1 KiB/s
\
|
| [0 files][163.2 MiB/236.4 MiB] 710.4 KiB/s
/
/ [0 files][164.0 MiB/236.4 MiB] 709.6 KiB/s
-
\
\ [0 files][164.7 MiB/236.4 MiB] 699.7 KiB/s
|
| [0 files][165.5 MiB/236.4 MiB] 724.1 KiB/s
/
-
- [0 files][166.3 MiB/236.4 MiB] 716.4 KiB/s
\
\ [0 files][167.1 MiB/236.4 MiB] 722.8 KiB/s
|
/
/ [0 files][167.8 MiB/236.4 MiB] 711.3 KiB/s
-
- [0 files][168.6 MiB/236.4 MiB] 696.3 KiB/s
\
|
| [0 files][169.4 MiB/236.4 MiB] 700.8 KiB/s
/
/ [0 files][170.2 MiB/236.4 MiB] 695.2 KiB/s
-
\
\ [0 files][170.9 MiB/236.4 MiB] 698.5 KiB/s
|
| [0 files][171.7 MiB/236.4 MiB] 712.5 KiB/s
/
-
- [0 files][172.5 MiB/236.4 MiB] 709.0 KiB/s
\
\ [0 files][173.3 MiB/236.4 MiB] 718.5 KiB/s
|
/
/ [0 files][174.0 MiB/236.4 MiB] 706.6 KiB/s
-
- [0 files][174.8 MiB/236.4 MiB] 715.6 KiB/s
\
|
| [0 files][175.6 MiB/236.4 MiB] 716.4 KiB/s
/
/ [0 files][176.3 MiB/236.4 MiB] 713.3 KiB/s
-
\
\ [0 files][177.1 MiB/236.4 MiB] 709.5 KiB/s
|
| [0 files][177.9 MiB/236.4 MiB] 719.8 KiB/s
/
-
- [0 files][178.7 MiB/236.4 MiB] 718.8 KiB/s
\
\ [0 files][179.4 MiB/236.4 MiB] 713.9 KiB/s
|
/
/ [0 files][180.2 MiB/236.4 MiB] 708.2 KiB/s
-
- [0 files][181.0 MiB/236.4 MiB] 700.1 KiB/s
\
|
| [0 files][181.8 MiB/236.4 MiB] 710.3 KiB/s
/
/ [0 files][182.5 MiB/236.4 MiB] 706.0 KiB/s
-
\
\ [0 files][183.3 MiB/236.4 MiB] 701.7 KiB/s
|
| [0 files][184.1 MiB/236.4 MiB] 709.6 KiB/s
/
-
- [0 files][184.9 MiB/236.4 MiB] 704.7 KiB/s
\
\ [0 files][185.6 MiB/236.4 MiB] 696.4 KiB/s
|
/
/ [0 files][186.4 MiB/236.4 MiB] 689.6 KiB/s
-
- [0 files][187.2 MiB/236.4 MiB] 687.6 KiB/s
\
|
| [0 files][188.0 MiB/236.4 MiB] 665.9 KiB/s
/
/ [0 files][188.5 MiB/236.4 MiB] 536.5 KiB/s
-
- [0 files][189.0 MiB/236.4 MiB] 492.5 KiB/s
\
|
| [0 files][189.8 MiB/236.4 MiB] 480.4 KiB/s
/
/ [0 files][190.5 MiB/236.4 MiB] 517.4 KiB/s
-
\
\ [0 files][191.3 MiB/236.4 MiB] 625.5 KiB/s
|
| [0 files][192.1 MiB/236.4 MiB] 674.9 KiB/s
/
-
- [0 files][192.8 MiB/236.4 MiB] 716.5 KiB/s
\
\ [0 files][193.6 MiB/236.4 MiB] 710.8 KiB/s
|
/
/ [0 files][194.4 MiB/236.4 MiB] 697.0 KiB/s
-
- [0 files][195.2 MiB/236.4 MiB] 700.0 KiB/s
\
|
| [0 files][195.9 MiB/236.4 MiB] 704.5 KiB/s
/
/ [0 files][196.7 MiB/236.4 MiB] 706.0 KiB/s
-
\
\ [0 files][197.5 MiB/236.4 MiB] 700.0 KiB/s
|
| [0 files][198.3 MiB/236.4 MiB] 700.1 KiB/s
/
-
- [0 files][199.0 MiB/236.4 MiB] 701.3 KiB/s
\
\ [0 files][199.8 MiB/236.4 MiB] 697.6 KiB/s
|
/
/ [0 files][200.6 MiB/236.4 MiB] 701.9 KiB/s
-
- [0 files][201.4 MiB/236.4 MiB] 703.8 KiB/s
\
|
| [0 files][202.1 MiB/236.4 MiB] 696.0 KiB/s
/
/ [0 files][202.9 MiB/236.4 MiB] 704.8 KiB/s
-
\
\ [0 files][203.7 MiB/236.4 MiB] 700.3 KiB/s
|
| [0 files][204.5 MiB/236.4 MiB] 696.0 KiB/s
/
-
- [0 files][205.2 MiB/236.4 MiB] 702.5 KiB/s
\
\ [0 files][206.0 MiB/236.4 MiB] 698.0 KiB/s
|
/
/ [0 files][206.8 MiB/236.4 MiB] 640.4 KiB/s
-
- [0 files][207.3 MiB/236.4 MiB] 498.1 KiB/s
\
|
| [0 files][207.8 MiB/236.4 MiB] 233.8 KiB/s
/
/ [0 files][208.1 MiB/236.4 MiB] 189.7 KiB/s
-
- [0 files][208.3 MiB/236.4 MiB] 123.7 KiB/s
\
\ [0 files][208.6 MiB/236.4 MiB] 130.0 KiB/s
|
/
/ [0 files][209.1 MiB/236.4 MiB] 208.3 KiB/s
-
- [0 files][209.6 MiB/236.4 MiB] 223.0 KiB/s
\
\ [0 files][209.9 MiB/236.4 MiB] 228.4 KiB/s
|
| [0 files][210.4 MiB/236.4 MiB] 211.9 KiB/s
/
/ [0 files][210.6 MiB/236.4 MiB] 238.9 KiB/s
-
\
\ [0 files][211.2 MiB/236.4 MiB] 291.5 KiB/s
|
| [0 files][211.4 MiB/236.4 MiB] 300.3 KiB/s
/
-
- [0 files][211.9 MiB/236.4 MiB] 321.0 KiB/s
\
\ [0 files][212.2 MiB/236.4 MiB] 279.1 KiB/s
|
| [0 files][212.7 MiB/236.4 MiB] 164.4 KiB/s
/
/ [0 files][213.0 MiB/236.4 MiB] 168.9 KiB/s
-
- [0 files][213.5 MiB/236.4 MiB] 241.9 KiB/s
\
\ [0 files][213.7 MiB/236.4 MiB] 225.1 KiB/s
|
| [0 files][214.2 MiB/236.4 MiB] 287.7 KiB/s
/
/ [0 files][214.8 MiB/236.4 MiB] 352.3 KiB/s
-
\
\ [0 files][215.5 MiB/236.4 MiB] 512.4 KiB/s
|
| [0 files][216.3 MiB/236.4 MiB] 554.0 KiB/s
/
-
- [0 files][217.1 MiB/236.4 MiB] 616.0 KiB/s
\
\ [0 files][217.9 MiB/236.4 MiB] 675.8 KiB/s
|
/
/ [0 files][218.6 MiB/236.4 MiB] 682.7 KiB/s
-
- [0 files][219.4 MiB/236.4 MiB] 685.8 KiB/s
\
|
| [0 files][220.2 MiB/236.4 MiB] 681.2 KiB/s
/
/ [0 files][221.0 MiB/236.4 MiB] 662.3 KiB/s
-
\
\ [0 files][221.7 MiB/236.4 MiB] 673.1 KiB/s
|
| [0 files][222.5 MiB/236.4 MiB] 675.1 KiB/s
/
-
- [0 files][223.3 MiB/236.4 MiB] 686.9 KiB/s
\
\ [0 files][224.0 MiB/236.4 MiB] 701.3 KiB/s
|
/
/ [0 files][224.8 MiB/236.4 MiB] 705.9 KiB/s
-
- [0 files][225.6 MiB/236.4 MiB] 702.4 KiB/s
\
|
| [0 files][226.4 MiB/236.4 MiB] 703.7 KiB/s
/
/ [0 files][227.1 MiB/236.4 MiB] 711.0 KiB/s
-
\
\ [0 files][227.9 MiB/236.4 MiB] 709.1 KiB/s
|
| [0 files][228.7 MiB/236.4 MiB] 710.7 KiB/s
/
-
- [0 files][229.5 MiB/236.4 MiB] 714.9 KiB/s
\
\ [0 files][230.2 MiB/236.4 MiB] 714.6 KiB/s
|
/
/ [0 files][231.0 MiB/236.4 MiB] 711.7 KiB/s
-
- [0 files][231.8 MiB/236.4 MiB] 711.9 KiB/s
\
|
| [0 files][232.6 MiB/236.4 MiB] 712.1 KiB/s
/
/ [0 files][233.3 MiB/236.4 MiB] 709.1 KiB/s
-
\
\ [0 files][234.1 MiB/236.4 MiB] 718.4 KiB/s
|
| [0 files][234.9 MiB/236.4 MiB] 713.3 KiB/s
/
-
- [0 files][235.6 MiB/236.4 MiB] 719.2 KiB/s
\
\ [0 files][236.4 MiB/236.4 MiB] 713.1 KiB/s
|
| [1 files][236.4 MiB/236.4 MiB] 634.6 KiB/s
Operation completed over 1 objects/236.4 MiB.
Alternatively load the result from the bucket to save time:
metabolomics_processed_datasets = load_from_bucket(
file="metabolomics_processed_datasets10282019.RData",
bucket = "gs://bic_data_analysis/pass1a/metabolomics/"
)[[1]]
Untargeted data are typically logged and analyzed using linear models. On the other hand, concentration data are analyzed with the same type of models but using the original data. This raises a problem if we wish to compare exact statistics from these data. In this section we perform residual analysis for single metabolites. Our goal is to identify if concentration data behaves “normally” when not logged. The analysis below examines the residuals of the data and uses qq-plots to determine if a log transformation is indeed required.
Compare overlaps, effect sizes, and correlations within tissues. Compare targeted-untargeted pairs only.
# helper function to transform a metabolomics matrix
# to that of its motrpac compound ids
extract_metab_data_from_row_annot<-function(x,row_annot_x){
# get the coloumn that has the row names
int_sizes = apply(row_annot_x,2,function(x,y)length(intersect(x,y)),y=rownames(x))
ind = which(int_sizes==max(int_sizes,na.rm = T))[1]
row_annot_x = row_annot_x[is.element(row_annot_x[,ind],set=rownames(x)),]
rownames(row_annot_x) = row_annot_x[,ind]
shared = intersect(rownames(row_annot_x),rownames(x))
x = x[shared,]
row_annot_x = row_annot_x[shared,]
rownames(x) = row_annot_x$motrpac_comp_name
return(x)
}
single_metabolite_corrs = list()
single_metabolite_de = c()
named2covered_shared_metabolites = list()
for(nn1 in names(metabolomics_processed_datasets)){
nn1_tissue = strsplit(nn1,split=",")[[1]][1]
nn1_tissue = gsub("_powder","",nn1_tissue)
if(grepl("untargeted",nn1)){next}
single_metabolite_corrs[[nn1]] = list()
named2covered_shared_metabolites[[nn1]] = NULL
for(nn2 in names(metabolomics_processed_datasets)){
if(nn2 == nn1){next}
if(!grepl("untargeted",nn2)){next}
nn2_tissue = strsplit(nn2,split=",")[[1]][1]
nn2_tissue = gsub("_powder","",nn2_tissue)
nn2_dataset = strsplit(nn2,split=",")[[1]][2]
if(nn1_tissue!=nn2_tissue){next}
print(nn2)
# get the numeric dataset
x = metabolomics_processed_datasets[[nn1]]$sample_data
# x = log2(x+1)
y = metabolomics_processed_datasets[[nn2]]$sample_data
# print(paste("dataset1:",nn1))
# print(range(x))
# print(paste("dataset2:",nn2))
# print(range(y))
row_annot_x = metabolomics_processed_datasets[[nn1]]$row_annot
row_annot_y = metabolomics_processed_datasets[[nn2]]$row_annot
# transform metabolite names to the motrpac comp name
x = extract_metab_data_from_row_annot(x,row_annot_x)
y = extract_metab_data_from_row_annot(y,row_annot_y)
# align the sample sets
bid_y = merged_dmaqc_data[colnames(y),"bid"]
bid_x = merged_dmaqc_data[colnames(x),"bid"]
# step 1: merge samples from the same BID
if(length(unique(bid_x))!=length(bid_x)){
x = aggregate_repeated_samples(x,bid_x)
}
else{
colnames(x) = bid_x
}
if(length(unique(bid_y))!=length(bid_y)){
y = aggregate_repeated_samples(y,bid_y)
}else{
colnames(y) = bid_y
}
# step 2: use the shared bio ids
shared_bids = as.character(intersect(colnames(y),colnames(x)))
x = as.matrix(x[,shared_bids])
y = as.matrix(y[,shared_bids])
# At this point x and y are over the same BIDs, now we add the metadata
y_meta = unique(metabolomics_processed_datasets[[nn1]]$sample_meta_parsed)
rownames(y_meta) = y_meta$bid
y_meta = y_meta[shared_bids,]
# get the shared matebolites
shared_metabolites = intersect(rownames(x),rownames(y))
shared_metabolites = na.omit(shared_metabolites)
if(length(shared_metabolites)==0){next}
named2covered_shared_metabolites[[nn1]] = union(
named2covered_shared_metabolites[[nn1]],
shared_metabolites
)
# Get statistics
if(length(shared_metabolites)>1){
corrs =cor(t(x[shared_metabolites,]),
t(y[shared_metabolites,]),method = "spearman")
}
else{
corrs = cor(x[shared_metabolites,],
y[shared_metabolites,],method = "spearman")
}
# take the covariates, ignore distances
curr_cov_cols = intersect(colnames(y_meta),biospec_cols[2])
curr_covs = data.frame(y_meta[,curr_cov_cols])
names(curr_covs) = curr_cov_cols
# # scale
# for(cov_name in names(curr_covs)){
# curr_covs[[cov_name]] = scale(curr_covs[[cov_name]],center = T,scale = T)
# }
curr_covs$sex = y_meta$animal.registration.sex
# differential analysis
for(tp in unique(y_meta$animal.key.timepoint)){
resx = t(apply(
matrix(x[shared_metabolites,],nrow=length(shared_metabolites)),1,
pass1a_simple_differential_abundance,
tps = y_meta$animal.key.timepoint,tp=tp,
is_control = y_meta$animal.key.is_control,
covs = curr_covs,return_model=F
))
resy = t(apply(
matrix(y[shared_metabolites,],nrow=length(shared_metabolites)),1,
pass1a_simple_differential_abundance,
tps = y_meta$animal.key.timepoint,tp=tp,
is_control = y_meta$animal.key.is_control,
covs = curr_covs,return_model=F
))
# Add dataset information, time point, tissue
added_columns = matrix(cbind(
rep(nn1_dataset,length(shared_metabolites)),
rep(nn2_dataset,length(shared_metabolites)),
shared_metabolites,
rep(tp,length(shared_metabolites)),
rep(nn1_tissue,length(shared_metabolites))
),nrow=length(shared_metabolites))
resx = cbind(resx,rep(T,nrow(resx)))
colnames(resx)[ncol(resx)] = "is_targeted"
resy = cbind(resy,rep(F,nrow(resy)))
colnames(resy)[ncol(resy)] = "is_targeted"
if(nrow(resx)>1){
resx = cbind(added_columns[,-2],resx)
resy = cbind(added_columns[,-1],resy)
}
else{
resx = c(added_columns[,-2],resx)
resy = c(added_columns[,-1],resy)
}
single_metabolite_de = rbind(single_metabolite_de,resx)
single_metabolite_de = rbind(single_metabolite_de,resy)
}
single_metabolite_corrs[[nn1]][[nn2]] = corrs
}
}
[1] "plasma,metab_u_hilicpos,untargeted"
[1] "plasma,metab_u_ionpneg,untargeted"
[1] "plasma,metab_u_lrpneg,untargeted"
[1] "plasma,metab_u_lrppos,untargeted"
[1] "plasma,metab_u_rpneg,untargeted"
[1] "plasma,metab_u_rppos,untargeted"
[1] "plasma,metab_u_hilicpos,untargeted"
[1] "plasma,metab_u_ionpneg,untargeted"
[1] "plasma,metab_u_lrpneg,untargeted"
[1] "plasma,metab_u_lrppos,untargeted"
[1] "plasma,metab_u_rpneg,untargeted"
[1] "plasma,metab_u_rppos,untargeted"
[1] "plasma,metab_u_hilicpos,untargeted"
[1] "plasma,metab_u_ionpneg,untargeted"
[1] "plasma,metab_u_lrpneg,untargeted"
[1] "plasma,metab_u_lrppos,untargeted"
[1] "plasma,metab_u_rpneg,untargeted"
[1] "plasma,metab_u_rppos,untargeted"
[1] "plasma,metab_u_hilicpos,untargeted"
[1] "plasma,metab_u_ionpneg,untargeted"
[1] "plasma,metab_u_lrpneg,untargeted"
[1] "plasma,metab_u_lrppos,untargeted"
[1] "plasma,metab_u_rpneg,untargeted"
[1] "plasma,metab_u_rppos,untargeted"
[1] "plasma,metab_u_hilicpos,untargeted"
[1] "plasma,metab_u_ionpneg,untargeted"
[1] "plasma,metab_u_lrpneg,untargeted"
[1] "plasma,metab_u_lrppos,untargeted"
[1] "plasma,metab_u_rpneg,untargeted"
[1] "plasma,metab_u_rppos,untargeted"
[1] "gastrocnemius_powder,metab_u_hilicpos,untargeted"
[1] "gastrocnemius_powder,metab_u_ionpneg,untargeted"
[1] "gastrocnemius_powder,metab_u_lrpneg,untargeted"
[1] "gastrocnemius_powder,metab_u_lrppos,untargeted"
[1] "gastrocnemius_powder,metab_u_rpneg,untargeted"
[1] "gastrocnemius_powder,metab_u_rppos,untargeted"
[1] "gastrocnemius_powder,metab_u_hilicpos,untargeted"
[1] "gastrocnemius_powder,metab_u_ionpneg,untargeted"
[1] "gastrocnemius_powder,metab_u_lrpneg,untargeted"
[1] "gastrocnemius_powder,metab_u_lrppos,untargeted"
[1] "gastrocnemius_powder,metab_u_rpneg,untargeted"
[1] "gastrocnemius_powder,metab_u_rppos,untargeted"
[1] "gastrocnemius_powder,metab_u_hilicpos,untargeted"
[1] "gastrocnemius_powder,metab_u_ionpneg,untargeted"
[1] "gastrocnemius_powder,metab_u_lrpneg,untargeted"
[1] "gastrocnemius_powder,metab_u_lrppos,untargeted"
[1] "gastrocnemius_powder,metab_u_rpneg,untargeted"
[1] "gastrocnemius_powder,metab_u_rppos,untargeted"
[1] "gastrocnemius_powder,metab_u_hilicpos,untargeted"
[1] "gastrocnemius_powder,metab_u_ionpneg,untargeted"
[1] "gastrocnemius_powder,metab_u_lrpneg,untargeted"
[1] "gastrocnemius_powder,metab_u_lrppos,untargeted"
[1] "gastrocnemius_powder,metab_u_rpneg,untargeted"
[1] "gastrocnemius_powder,metab_u_rppos,untargeted"
[1] "gastrocnemius_powder,metab_u_hilicpos,untargeted"
[1] "gastrocnemius_powder,metab_u_ionpneg,untargeted"
[1] "gastrocnemius_powder,metab_u_lrpneg,untargeted"
[1] "gastrocnemius_powder,metab_u_lrppos,untargeted"
[1] "gastrocnemius_powder,metab_u_rpneg,untargeted"
[1] "gastrocnemius_powder,metab_u_rppos,untargeted"
[1] "gastrocnemius_powder,metab_u_hilicpos,untargeted"
[1] "gastrocnemius_powder,metab_u_ionpneg,untargeted"
[1] "gastrocnemius_powder,metab_u_lrpneg,untargeted"
[1] "gastrocnemius_powder,metab_u_lrppos,untargeted"
[1] "gastrocnemius_powder,metab_u_rpneg,untargeted"
[1] "gastrocnemius_powder,metab_u_rppos,untargeted"
[1] "gastrocnemius_powder,metab_u_hilicpos,untargeted"
[1] "gastrocnemius_powder,metab_u_ionpneg,untargeted"
[1] "gastrocnemius_powder,metab_u_lrpneg,untargeted"
[1] "gastrocnemius_powder,metab_u_lrppos,untargeted"
[1] "gastrocnemius_powder,metab_u_rpneg,untargeted"
[1] "gastrocnemius_powder,metab_u_rppos,untargeted"
[1] "gastrocnemius_powder,metab_u_hilicpos,untargeted"
[1] "gastrocnemius_powder,metab_u_ionpneg,untargeted"
[1] "gastrocnemius_powder,metab_u_lrpneg,untargeted"
[1] "gastrocnemius_powder,metab_u_lrppos,untargeted"
[1] "gastrocnemius_powder,metab_u_rpneg,untargeted"
[1] "gastrocnemius_powder,metab_u_rppos,untargeted"
[1] "gastrocnemius_powder,metab_u_hilicpos,untargeted"
[1] "gastrocnemius_powder,metab_u_ionpneg,untargeted"
[1] "gastrocnemius_powder,metab_u_lrpneg,untargeted"
[1] "gastrocnemius_powder,metab_u_lrppos,untargeted"
[1] "gastrocnemius_powder,metab_u_rpneg,untargeted"
[1] "gastrocnemius_powder,metab_u_rppos,untargeted"
[1] "liver_powder,metab_u_hilicpos,untargeted"
[1] "liver_powder,metab_u_ionpneg,untargeted"
[1] "liver_powder,metab_u_lrpneg,untargeted"
[1] "liver_powder,metab_u_lrppos,untargeted"
[1] "liver_powder,metab_u_rpneg,untargeted"
[1] "liver_powder,metab_u_rppos,untargeted"
[1] "liver_powder,metab_u_hilicpos,untargeted"
[1] "liver_powder,metab_u_ionpneg,untargeted"
[1] "liver_powder,metab_u_lrpneg,untargeted"
[1] "liver_powder,metab_u_lrppos,untargeted"
[1] "liver_powder,metab_u_rpneg,untargeted"
[1] "liver_powder,metab_u_rppos,untargeted"
[1] "liver_powder,metab_u_hilicpos,untargeted"
[1] "liver_powder,metab_u_ionpneg,untargeted"
[1] "liver_powder,metab_u_lrpneg,untargeted"
[1] "liver_powder,metab_u_lrppos,untargeted"
[1] "liver_powder,metab_u_rpneg,untargeted"
[1] "liver_powder,metab_u_rppos,untargeted"
[1] "liver_powder,metab_u_hilicpos,untargeted"
[1] "liver_powder,metab_u_ionpneg,untargeted"
[1] "liver_powder,metab_u_lrpneg,untargeted"
[1] "liver_powder,metab_u_lrppos,untargeted"
[1] "liver_powder,metab_u_rpneg,untargeted"
[1] "liver_powder,metab_u_rppos,untargeted"
[1] "liver_powder,metab_u_hilicpos,untargeted"
[1] "liver_powder,metab_u_ionpneg,untargeted"
[1] "liver_powder,metab_u_lrpneg,untargeted"
[1] "liver_powder,metab_u_lrppos,untargeted"
[1] "liver_powder,metab_u_rpneg,untargeted"
[1] "liver_powder,metab_u_rppos,untargeted"
[1] "liver_powder,metab_u_hilicpos,untargeted"
[1] "liver_powder,metab_u_ionpneg,untargeted"
[1] "liver_powder,metab_u_lrpneg,untargeted"
[1] "liver_powder,metab_u_lrppos,untargeted"
[1] "liver_powder,metab_u_rpneg,untargeted"
[1] "liver_powder,metab_u_rppos,untargeted"
[1] "liver_powder,metab_u_hilicpos,untargeted"
[1] "liver_powder,metab_u_ionpneg,untargeted"
[1] "liver_powder,metab_u_lrpneg,untargeted"
[1] "liver_powder,metab_u_lrppos,untargeted"
[1] "liver_powder,metab_u_rpneg,untargeted"
[1] "liver_powder,metab_u_rppos,untargeted"
[1] "liver_powder,metab_u_hilicpos,untargeted"
[1] "liver_powder,metab_u_ionpneg,untargeted"
[1] "liver_powder,metab_u_lrpneg,untargeted"
[1] "liver_powder,metab_u_lrppos,untargeted"
[1] "liver_powder,metab_u_rpneg,untargeted"
[1] "liver_powder,metab_u_rppos,untargeted"
[1] "white_adipose_powder,metab_u_hilicpos,untargeted"
[1] "white_adipose_powder,metab_u_lrpneg,untargeted"
[1] "white_adipose_powder,metab_u_lrppos,untargeted"
[1] "white_adipose_powder,metab_u_rpneg,untargeted"
[1] "white_adipose_powder,metab_u_rppos,untargeted"
[1] "white_adipose_powder,metab_u_hilicpos,untargeted"
[1] "white_adipose_powder,metab_u_lrpneg,untargeted"
[1] "white_adipose_powder,metab_u_lrppos,untargeted"
[1] "white_adipose_powder,metab_u_rpneg,untargeted"
[1] "white_adipose_powder,metab_u_rppos,untargeted"
[1] "white_adipose_powder,metab_u_hilicpos,untargeted"
[1] "white_adipose_powder,metab_u_lrpneg,untargeted"
[1] "white_adipose_powder,metab_u_lrppos,untargeted"
[1] "white_adipose_powder,metab_u_rpneg,untargeted"
[1] "white_adipose_powder,metab_u_rppos,untargeted"
[1] "white_adipose_powder,metab_u_hilicpos,untargeted"
[1] "white_adipose_powder,metab_u_lrpneg,untargeted"
[1] "white_adipose_powder,metab_u_lrppos,untargeted"
[1] "white_adipose_powder,metab_u_rpneg,untargeted"
[1] "white_adipose_powder,metab_u_rppos,untargeted"
[1] "white_adipose_powder,metab_u_hilicpos,untargeted"
[1] "white_adipose_powder,metab_u_lrpneg,untargeted"
[1] "white_adipose_powder,metab_u_lrppos,untargeted"
[1] "white_adipose_powder,metab_u_rpneg,untargeted"
[1] "white_adipose_powder,metab_u_rppos,untargeted"
[1] "white_adipose_powder,metab_u_hilicpos,untargeted"
[1] "white_adipose_powder,metab_u_lrpneg,untargeted"
[1] "white_adipose_powder,metab_u_lrppos,untargeted"
[1] "white_adipose_powder,metab_u_rpneg,untargeted"
[1] "white_adipose_powder,metab_u_rppos,untargeted"
[1] "white_adipose_powder,metab_u_hilicpos,untargeted"
[1] "white_adipose_powder,metab_u_lrpneg,untargeted"
[1] "white_adipose_powder,metab_u_lrppos,untargeted"
[1] "white_adipose_powder,metab_u_rpneg,untargeted"
[1] "white_adipose_powder,metab_u_rppos,untargeted"
[1] "white_adipose_powder,metab_u_hilicpos,untargeted"
[1] "white_adipose_powder,metab_u_lrpneg,untargeted"
[1] "white_adipose_powder,metab_u_lrppos,untargeted"
[1] "white_adipose_powder,metab_u_rpneg,untargeted"
[1] "white_adipose_powder,metab_u_rppos,untargeted"
[1] "white_adipose_powder,metab_u_hilicpos,untargeted"
[1] "white_adipose_powder,metab_u_lrpneg,untargeted"
[1] "white_adipose_powder,metab_u_lrppos,untargeted"
[1] "white_adipose_powder,metab_u_rpneg,untargeted"
[1] "white_adipose_powder,metab_u_rppos,untargeted"
single_metabolite_de = data.frame(single_metabolite_de)
names(single_metabolite_de) = c(
"dataset","metabolite","tp","tissue",
"Est","Std","Tstat","Pvalue","is_targeted")
for(col in names(single_metabolite_de)[-c(1:4)]){
single_metabolite_de[[col]] = as.numeric(
as.character(single_metabolite_de[[col]]))
}
for(col in names(single_metabolite_de)[1:4]){
single_metabolite_de[[col]] =
as.character(single_metabolite_de[[col]])
}
We next transform the data above into tables that contain data for each combination of metabolite, time point, and tissue. These are then used for two meta-analyses: (1) a simple random effects analysis, and (2) random effects with a binary covariate indicating if a dataset is targeted or untargeted.
rownames(single_metabolite_de) = NULL
for(nn in names(single_metabolite_de)){
ndig = 5
if(grepl("pval",nn,ignore.case = T)){
ndig = 10
}
if(is.numeric(single_metabolite_de[[nn]])){
single_metabolite_de[[nn]] = round(single_metabolite_de[[nn]],digits = ndig)
}
}
single_metabolite_de = unique(single_metabolite_de)
library(metafor)
meta_analysis_stats = list()
for(tissue in unique(single_metabolite_de$tissue)){
for(tp in unique(single_metabolite_de$tp)){
curr_subset = single_metabolite_de[
single_metabolite_de$tissue==tissue &
single_metabolite_de$tp==tp,]
for(metabolite in unique(curr_subset$metabolite)){
curr_met_data = curr_subset[
curr_subset$metabolite==metabolite,]
curr_met_data$var = curr_met_data$Std^2
re_model1 = NULL;re_model2=NULL;re_model_tar = NULL
try({re_model1 = rma.uni(curr_met_data$Est,curr_met_data$var)})
try({re_model2 = rma.mv(curr_met_data$Est,curr_met_data$var,
mods=curr_met_data$is_targeted)})
try({re_model_tar = rma.uni(
curr_met_data[curr_met_data$is_targeted==1,"Est"],
curr_met_data[curr_met_data$is_targeted==1,"var"]
)})
meta_analysis_stats[[paste(metabolite,tissue,tp,sep=",")]] =
list(curr_met_data=curr_met_data,re_model1=re_model1,
re_model2 = re_model2)
}
}
}
We now show some plots to summarize the comparison. We plot the overall average correlation between the platforms (within tissues). However, going into a single metabolite comparison in detail (again, within tissues), for each metabolite, in each time point we perform a meta-analysis of the effects and examine the significance and heterogeneity.
# Do we have nice coverages of the named datasets?
library(ggplot2)
dataset2num_metabolites = sapply(metabolomics_processed_datasets,
function(x)nrow(x$sample_data))
named_dataset_coverage = sapply(named2covered_shared_metabolites,length)
named_dataset_coverage = data.frame(
name = names(named_dataset_coverage),
percentage = named_dataset_coverage /
dataset2num_metabolites[names(named_dataset_coverage)],
count = named_dataset_coverage,
total = dataset2num_metabolites[names(named_dataset_coverage)]
)
# add datasets with no coverage
all_targeted_datasets = names(metabolomics_processed_datasets)
all_targeted_datasets = all_targeted_datasets[!grepl("untarg",all_targeted_datasets)]
zero_coverage_datasets = setdiff(all_targeted_datasets,
named_dataset_coverage$name)
zero_coverage_datasets = data.frame(
name = zero_coverage_datasets,
percentage = rep(0,length(zero_coverage_datasets)),
count = rep(0,length(zero_coverage_datasets)),
total = dataset2num_metabolites[zero_coverage_datasets]
)
named_dataset_coverage = rbind(named_dataset_coverage,
zero_coverage_datasets)
named_dataset_coverage =
named_dataset_coverage[order(as.character(named_dataset_coverage$name)),]
print(ggplot(named_dataset_coverage, aes(x=name, y=percentage)) +
geom_bar(stat = "identity",width=0.2) + coord_flip() +
geom_text(data=named_dataset_coverage,
aes(name, percentage+0.05, label=count),
position = position_dodge(width=0.9),
size=4) +
ggtitle("Named datasets: coverage by untargeted datasets"))
# Next examine the Spearman correlations between platforms
extract_diag_vs_non_diag<-function(corrs,func=mean,...){
if(length(corrs)==1){
return(c(same=func(corrs,...),other=NA))
}
same = func(diag(corrs),...)
other = func(
c(corrs[lower.tri(corrs,diag = F)]),...)
return(c(same=same,other=other))
}
# tar_dataset2tissue = sapply(names(single_metabolite_corrs),
# function(x)strsplit(x,split=",")[[1]][1])
# tar_dataset2tissue = gsub("_powder","",tar_dataset2tissue)
for(tar_dataset in names(single_metabolite_corrs)){
l = single_metabolite_corrs[[tar_dataset]]
if(length(l)==0){next}
corr_info = as.data.frame(t(sapply(l, extract_diag_vs_non_diag)))
corr_sd = as.data.frame(t(sapply(l, extract_diag_vs_non_diag,func=sd)))
rownames(corr_info) = sapply(rownames(corr_info),
function(x)strsplit(x,split=",")[[1]][2])
rownames(corr_info) = gsub("metab_u_","",rownames(corr_info))
rownames(corr_sd) = rownames(corr_info)
corr_info$dataset = rownames(corr_info)
corr_sd$dataset = corr_info$dataset
corr_info = melt(corr_info)
corr_sd = melt(corr_sd)
corr_info$sd = corr_sd$value
print(
ggplot(corr_info, aes(x=dataset, y=value, fill=variable)) +
geom_bar(position=position_dodge(), stat="identity", colour='black') +
geom_errorbar(aes(ymin=value-sd, ymax=value+sd),na.rm=T,
width=.2,position=position_dodge(.9)) +
ggtitle(tar_dataset) + xlab("Untargeted dataset") + ylab("Spearman") +
labs(fill = "Pair type") +
theme(legend.position="top",legend.direction = "horizontal")
)
}
Using dataset as id variables
Using dataset as id variables
# Look at the meta-analysis results
I2_scores = sapply(meta_analysis_stats,function(x)x$re_model1$I2)
I2_scores = unlist(I2_scores[sapply(I2_scores,length)>0])
hist(I2_scores)
betas = sapply(meta_analysis_stats,function(x)x$re_model1$beta[1,1])
betas = unlist(betas[names(I2_scores)])
pvals = sapply(meta_analysis_stats,function(x)x$re_model1$pval)
pvals = unlist(pvals[names(I2_scores)])
targeted_diff_p =
sapply(meta_analysis_stats,function(x)x$re_model2$pval[2])
targeted_diff_p = unlist(targeted_diff_p[names(I2_scores)])
plot(betas,I2_scores)
plot(-log10(pvals),I2_scores)
plot(-log10(targeted_diff_p),I2_scores)
plot(-log10(targeted_diff_p),-log10(pvals))
# plot examples
agree_example = names(sample(which(pvals< 0.001 & I2_scores < 20))[1])
forest(meta_analysis_stats[[agree_example]]$re_model1,
slab = meta_analysis_stats[[agree_example]][[1]][,1],
main = agree_example,xlab = "Log fc",
col = "blue",cex = 1.1)
disagree_example = names(sample(which(targeted_diff_p< 0.001)))[1]
forest(meta_analysis_stats[[disagree_example]]$re_model1,
slab = meta_analysis_stats[[disagree_example]][[1]][,1],
main = disagree_example,xlab = "Log fc",
col = "blue",cex = 1.1)
Use 10-fold cross validation for analysis within tissues.
nfolds = 10
prediction_analysis_results = list()
for(nn1 in names(metabolomics_processed_datasets)){
nn1_tissue = strsplit(nn1,split=",")[[1]][1]
nn1_tissue = gsub("_powder","",nn1_tissue)
for(nn2 in names(metabolomics_processed_datasets)){
nn2_tissue = strsplit(nn2,split=",")[[1]][1]
nn2_tissue = gsub("_powder","",nn2_tissue)
if(nn1_tissue!=nn2_tissue){next}
print(paste("training set:",nn2))
print(paste("test set:",nn1))
# get the data, merge samples by bid if necessary
y = metabolomics_processed_datasets[[nn1]]$sample_data
x = metabolomics_processed_datasets[[nn2]]$sample_data
y_meta = unique(metabolomics_processed_datasets[[nn1]]$sample_meta_parsed)
# # remove metadata variables with too many NAs
# na_counts = apply(is.na(y_meta),2,sum)
# y_meta = y_meta[,na_counts/nrow(y_meta) == 0]
rownames(y_meta) = y_meta$bid
bid_y = merged_dmaqc_data[colnames(y),"bid"]
bid_x = merged_dmaqc_data[colnames(x),"bid"]
# merge samples from the same BID
if(length(unique(bid_x))!=length(bid_x)){
x = aggregate_repeated_samples(x,bid_x)
}
else{
colnames(x) = bid_x
}
if(length(unique(bid_y))!=length(bid_y)){
y = aggregate_repeated_samples(y,bid_y)
}else{
colnames(y) = bid_y
}
if(ncol(y)>1000){next}
cov_cols = intersect(colnames(y_meta),
setdiff(biospec_cols,"bid"))
covs = as.matrix(y_meta[colnames(y),cov_cols])
# regress the covariates out - simple linear analysis
y = lm_regress_out_matrix(t(y),covs)
# Prepare the input for the ML part
shared_bids = as.character(intersect(rownames(y),colnames(x)))
x = t(as.matrix(x[,shared_bids]))
y = as.matrix(y[shared_bids,])
# Run the regressions
folds = sample(rep(1:nfolds,(1+nrow(x)/nfolds)))[1:nrow(x)]
numFeatures = min(ncol(x),2000)
preds = c();real=c()
for(i in 1:ncol(y)){
y_i = y[,1]
i_preds = c();i_real=c()
for(j in 1:nfolds){
print(j)
tr_x = x[folds!=j,]
tr_yi = y_i[folds!=j]
te_x = x[folds==j,]
te_y = y_i[folds==j]
# random forest
# model = randomForest(tr_yi,x=tr_x,ntree = 20)
# te_preds = predict(model,newdata = te_x)
model = feature_selection_wrapper(tr_x,tr_yi,
coeff_of_var,randomForest,
topK = numFeatures,ntree=50)
te_preds = predict(model,newdata = te_x)
i_preds = c(i_preds,te_preds)
i_real = c(i_real,te_y)
}
preds = cbind(preds,i_preds)
real = cbind(real,i_real)
}
currname = paste(nn1,nn2,sep=";")
prediction_analysis_results[[currname]] = list(
preds = preds,real=real
)
}
}
names(prediction_analysis_results)
cov_prediction_analysis_results = list()
for(nn1 in names(metabolomics_processed_datasets)){
print(nn1)
y = metabolomics_processed_datasets[[nn1]]$sample_data
y_vials = colnames(y)
bid_y = merged_dmaqc_data[colnames(y),"bid"]
colnames(y) = bid_y
y = t(as.matrix(y))
if(ncol(y)>1000){next}
cov_cols = c("animal.registration.sex",
"acute.test.weight",
"acute.test.distance",
"animal.key.timepoint")
covs = merged_dmaqc_data[y_vials,cov_cols]
x = covs
# Run the regressions
folds = sample(rep(1:nfolds,(1+nrow(x)/nfolds)))[1:nrow(x)]
numFeatures = min(ncol(x),2000)
preds = c();real=c()
for(i in 1:ncol(y)){
y_i = y[,1]
i_preds = c();i_real=c()
for(j in 1:nfolds){
print(j)
tr_x = x[folds!=j,]
tr_yi = y_i[folds!=j]
te_x = x[folds==j,]
te_y = y_i[folds==j]
# random forest
model = randomForest(tr_yi,x=tr_x,ntree = 20)
te_preds = predict(model,newdata = te_x)
i_preds = c(i_preds,te_preds)
i_real = c(i_real,te_y)
}
preds = cbind(preds,i_preds)
real = cbind(real,i_real)
}
cov_prediction_analysis_results[[nn1]] = list(
preds = preds,real=real
)
}
results_metrics = c()
for(nn in names(prediction_analysis_results)){
preds = prediction_analysis_results[[nn]]$preds
real = prediction_analysis_results[[nn]]$real
rhos1 = diag(cor(preds,real))
rhos2 = diag(cor(preds,real,method="spearman"))
SEs = (preds-real)^2
mse = mean(SEs)
normSEs = SEs / apply(real,2,sd)
curr_scores = c(mean(rhos1),mean(rhos2),min(rhos1),min(rhos2),
mse,mean(normSEs))
names(curr_scores) = c("mean_rho","mean_spearman_rho","min_rho","min_spearman_rho",
"MSE","mean MSE/SD")
results_metrics = rbind(results_metrics,
curr_scores)
rownames(results_metrics)[nrow(results_metrics)] = nn
}
cov_results_metrics = c()
for(nn in names(cov_prediction_analysis_results)){
preds = cov_prediction_analysis_results[[nn]]$preds
real = cov_prediction_analysis_results[[nn]]$real
rhos1 = diag(cor(preds,real))
rhos2 = diag(cor(preds,real,method="spearman"))
SEs = (preds-real)^2
mse = mean(SEs)
normSEs = SEs / apply(real,2,sd)
curr_scores = c(mean(rhos1),mean(rhos2),min(rhos1),min(rhos2),
mse,mean(normSEs))
names(curr_scores) = c("mean_rho","mean_spearman_rho","min_rho","min_spearman_rho",
"MSE","mean MSE/SD")
cov_results_metrics = rbind(cov_results_metrics,
curr_scores)
rownames(cov_results_metrics)[nrow(cov_results_metrics)] = nn
}
# Some boxplots
pred_targets = sapply(names(prediction_analysis_results),function(x)
strsplit(x,split = ";")[[1]][1])
target = "liver_powder,metab_t_tca,named"
curr_res = unique(results_metrics[pred_targets == target,])
rownames(curr_res) = gsub(target,"",rownames(curr_res))
rownames(curr_res) = gsub(";","",rownames(curr_res))
rownames(curr_res)[rownames(curr_res)==""] = target
cov_baseline = cov_results_metrics[target,]
par(mar = c(8,4,2,2))
cols = rep("blue",nrow(curr_res))
cols[rownames(curr_res)==target] = "black"
plt = barplot(curr_res[,4],beside = T,xaxt="n",legend=F,
ylab="Min rho (Spearman)",
ylim = c(0.5,1),xpd=F,col=cols,
main = target)
text(plt, par("usr")[3], labels = rownames(curr_res),
srt = 45, adj = c(1.1,1.1), xpd = T, cex=0.6)
abline(h=cov_baseline[4],lty=2,lwd=2,col="red")
par(mar = c(8,4,2,2))
plt = barplot(curr_res[,6],beside = T,xaxt="n",legend=F,
ylab="Mean standardized SE",xpd=F,
main = target,col=cols)
text(plt, par("usr")[3], labels = rownames(curr_res),
srt = 45, adj = c(1.1,1.1), xpd = T, cex=0.6)
abline(h=cov_baseline[6],lty=2,lwd=2,col="red")
# preds = c();real=c()
# for(j in 1:nfolds){
# tr_x = x[folds!=j,]
# tr_y = y[folds!=j,]
# te_x = x[folds==j,]
# te_y = y[folds==j,]
# model = MTL_wrapper(tr_x,tr_y,type="Regression", Regularization="L21")
# te_preds = predict(model,te_x)
# real = rbind(real,te_y)
# preds = rbind(preds,te_preds)
# }
# diag(cor(preds,real))
# Using PLS regression
# library(pls)
# pls_model = plsr(y~x,ncomp = 5,validation="LOO")
# eval = MSEP(pls_model)
#
# y_pca = prcomp(y)
# plot(y_pca)
# explained_var = y_pca$sdev^2/sum(y_pca$sdev^2)
# y_pca_matrix = y_pca$x[,1:10]
#
# # regress out sex, weight
#
# get_explained_variance_using_PCA(x,y)
# x = apply(x,2,regress_out,covs=covs)
# y = apply(y,2,regress_out,covs=covs)
# get_explained_variance_using_PCA(x,y)
CV <-function(x){
return(sd(x)/mean(x))
}
# Get all correlation and covariance matrices
y = metabolomics_parsed_datasets[[1]]$sample_data # anchor all datasets by these ids
y_vials = colnames(y)
bid_y = as.character(merged_dmaqc_data[colnames(y),"bid"])
cov_cols = c("animal.registration.sex",
"acute.test.weight",
"acute.test.distance")
covs = merged_dmaqc_data[y_vials,cov_cols]
cov_matrices = list()
cor_matrices = list()
for(nn in names(metabolomics_parsed_datasets)){
x = metabolomics_parsed_datasets[[nn]]$sample_data
bid_x = as.character(merged_dmaqc_data[colnames(x),"bid"])
print(sum(!is.element(bid_y,set=bid_x))) # should be zero
colnames(x) = bid_x
x = x[,bid_y]
# x = apply(x,1,regress_out,covs=covs)
# x = t(x)
# cvs = apply(x,1,CV)
# print(table(cvs>0.5))
# x = x[cvs>0.5,]
# pcax = t(prcomp(t(x),scale. = F)$x[,1:10])
cov_matrices[[nn]] = cov(x)
cor_matrices[[nn]] = cor(x)
}
sapply(cor_matrices,dim)
library('evolqg');library(corrplot)
mantel_tests= MantelCor(cor_matrices)
mcors= MatrixCor(cor_matrices)
mcors = as.matrix(forceSymmetric(mcors,uplo="L"))
mantel_tests = as.matrix(
forceSymmetric(mantel_tests$probabilities,uplo = "L"))
# diag(mantel_tests)=0
ord = corrplot(t(mcors),order="hclust")
ord = rownames(ord)
corrplot(mcors[ord,ord],p.mat=mantel_tests[ord,ord],
insig = "label_sig",method="shade",
sig.level = c(.0001, .001, .01),
pch.cex = .9, pch.col = "black")
# rs_res = RandomSkewers(cor_matrices)
# corrplot(rs_res$correlations,
# p.mat=mantel_tests$probabilities,type="lower",
# insig = "label_sig",method="shade",
# sig.level = c(.0001, .001, .01),
# pch.cex = .9, pch.col = "black",tl.cex=0.6)
library(psych)
r1 = cor_matrices[[1]]
r2 = cor_matrices[[8]]
cor(r1[lower.tri(r1)],r2[lower.tri(r2)])
mantel_tests[1,8]
mantel_tests[8,1]
threshold_comp = apply_function_on_pairs(cor_matrices,
function(x,y)sum(x>0.7&y>0.7))
corrplot(threshold_comp,is.corr = F,order="hclust")
threshold_comp = apply_function_on_pairs(cor_matrices,
function(x,y)mean(diag(cor(x,y))))
corrplot(threshold_comp,is.corr = F,order="hclust")
rnaseq_data_for_difftests = load_from_bucket(
"rnaseq_data_for_difftests.RData",
"gs://bic_data_analysis/pass1a/rnaseq/"
)[[1]]
met_vs_rnaseq_cors = c()
for(nn in names(rnaseq_data_for_difftests)){
d = rnaseq_data_for_difftests[[nn]]$fpkms
covs = rnaseq_data_for_difftests[[nn]]$dmaqc_meta[,cov_cols]
cvs = apply(d,1,CV)
print(table(cvs>0.25))
x = d[cvs>0.25,]
# x = apply(x,1,regress_out,covs=covs)
# x = t(x)
bid_x = sapply(colnames(x),substr,1,5)
print(sum(!is.element(bid_y,set=bid_x))) # should be zero
colnames(x) = bid_x
x = x[,bid_y]
curr_cors = cor(x)
v = c()
for(nn2 in names(cor_matrices)){
v[nn2] = MatrixCor(curr_cors,cor_matrices[[nn2]])
}
met_vs_rnaseq_cors = rbind(met_vs_rnaseq_cors,v)
rownames(met_vs_rnaseq_cors)[nrow(met_vs_rnaseq_cors)] = nn
}
library(gplots)
heatmap.2(t(met_vs_rnaseq_cors),
trace="none",scale=NULL,mar=c(15,15),
key.xlab = "Correlation",
col=redblue(200), breaks=seq(-1,1,0.01))
metabolite_sets = list()
for(nn in names(metabolomics_parsed_datasets)){
an = metabolomics_parsed_datasets[[nn]]$row_annot
name_cols = colnames(an)[grepl("_name",colnames(an),ignore.case = T)]
metabolite_sets[[nn]] = unique(tolower(an[,name_cols[1]]))
}
overlap_matrix = apply_function_on_pairs(metabolite_sets,
function(x,y)length(intersect(x,y)))
corrplot(log(overlap_matrix+1,10),is.corr = F)
site_cor_comparison = apply_function_on_pairs(
metabolomics_parsed_datasets,
compare_two_met_sites,print_progress = T,
merged_dmaqc_data = merged_dmaqc_data,samplesize=500
)
site_cor_comparison[is.na(site_cor_comparison)]=0
corrplot(site_cor_comparison,order="hclust")
compare_two_met_sites<-function(x,y,merged_dmaqc_data,samplesize=100,...){
an_x = x$row_annot
name_colsx = colnames(an_x)[grepl("_name",colnames(an_x),ignore.case = T)]
an_y = y$row_annot
name_colsy = colnames(an_y)[grepl("_name",colnames(an_y),ignore.case = T)]
names_x = as.character(an_x[,name_colsx[1]])
names_y = as.character(an_y[,name_colsy[1]])
shared = intersect(names_x,names_y)
shared = setdiff(shared,c(NA,""))
mx = x$sample_data
my = y$sample_data
if(any(table(names_x)>1)){
mx = apply(mx,2,function(x,y)tapply(x,INDEX=y,FUN=mean,na.rm=T),y=names_x)
}
else{
rownames(mx) = as.character(names_x)
}
if(any(table(names_y)>1)){
my = apply(my,2,function(x,y)tapply(x,INDEX=y,FUN=mean,na.rm=T),y=names_y)
}
else{
rownames(my) = as.character(names_y)
}
bid_x = as.character(merged_dmaqc_data[colnames(mx),"bid"])
bid_y = as.character(merged_dmaqc_data[colnames(my),"bid"])
if(sum(!is.element(bid_y,set=bid_x))>0){
stop("BIDs do not match between x and y")
} # should be zero
colnames(mx) = bid_x
colnames(my) = bid_y
if(length(shared)>samplesize){
shared = sample(shared)[1:samplesize]
}
mx = mx[shared,bid_y]
my = my[shared,]
mx = as.matrix(mx);my = as.matrix(my)
if(nrow(mx)<2){return(NA)}
if(length(shared)<=100){
corrs = diag(cor(t(mx),t(my)))
}
else{
corrs = c()
for(i in 1:nrow(mx)){
# print(mean(mx[i,]))
corrs[rownames(mx)[i]] = cor(mx[i,],my[i,])
}
}
return(mean(corrs))
}